{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1ba817ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re  # 正则表达式提取文本\n",
    "from jsonpath import jsonpath  # 解析json数据\n",
    "import requests  # 发送请求\n",
    "import pandas as pd  # 存取csv文件\n",
    "import datetime  # 转换时间用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1963c7f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 请求头\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36\",\n",
    "    \"accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\",\n",
    "    \"accept-encoding\": \"gzip, deflate, br\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f374420",
   "metadata": {},
   "outputs": [],
   "source": [
    "def trans_time(v_str):\n",
    "    \"\"\"转换GMT时间为标准格式\"\"\"\n",
    "    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'\n",
    "    timeArray = datetime.datetime.strptime(v_str, GMT_FORMAT)\n",
    "    ret_time = timeArray.strftime(\"%Y-%m-%d %H:%M:%S\")\n",
    "    return ret_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "370d2a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getLongText(v_id):\n",
    "    \"\"\"Fetch the full text of a long Weibo post and strip its HTML tags.\n",
    "\n",
    "    :param v_id: id of the post, appended to the ``statuses/extend`` URL\n",
    "    :return: ``data.longTextContent`` of the response with every ``<...>`` tag removed\n",
    "    :raises requests.RequestException: on network failure, timeout or HTTP error status\n",
    "    :raises KeyError: if the response JSON has no ``data.longTextContent``\n",
    "    \"\"\"\n",
    "    url = 'https://m.weibo.cn/statuses/extend?id=' + str(v_id)\n",
    "    # Without a timeout a single stalled connection would hang the whole crawl.\n",
    "    r = requests.get(url, headers=headers, timeout=10)\n",
    "    # Surface HTTP errors as such instead of as a confusing JSON/KeyError later on.\n",
    "    r.raise_for_status()\n",
    "    long_text = r.json()['data']['longTextContent']\n",
    "    # Data cleaning: drop HTML tags from the post body\n",
    "    return re.sub(r'<[^>]+>', '', long_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "82bb9596",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _parse_mblog(mblog):\n",
    "    \"\"\"Flatten one ``mblog`` dict from the search response into a CSV row dict (page number excluded).\"\"\"\n",
    "    post_id = mblog.get('id')\n",
    "    # Data cleaning: strip HTML tags from the post body\n",
    "    text = re.sub(r'<[^>]+>', '', mblog.get('text') or '')\n",
    "    if mblog.get('isLongText'):\n",
    "        # isLongText marks posts whose full text is fetched separately; if that\n",
    "        # extra request fails, keep the text we already have instead of aborting.\n",
    "        try:\n",
    "            text = getLongText(v_id=post_id)\n",
    "        except Exception as err:\n",
    "            print('getLongText failed for {}: {}'.format(post_id, err))\n",
    "    created_at = mblog.get('created_at') or ''\n",
    "    try:\n",
    "        created_at = trans_time(v_str=created_at) if created_at else ''\n",
    "    except (ValueError, TypeError):\n",
    "        pass  # unexpected layout: keep the raw value rather than lose the row\n",
    "    return {\n",
    "        '微博id': post_id,\n",
    "        '微博作者': (mblog.get('user') or {}).get('screen_name', ''),\n",
    "        '发布时间': created_at,\n",
    "        '微博内容': text,\n",
    "        '转发数': mblog.get('reposts_count'),\n",
    "        '评论数': mblog.get('comments_count'),\n",
    "        '点赞数': mblog.get('attitudes_count'),\n",
    "        '发布于': mblog.get('region_name', ''),\n",
    "        'ip属地_城市': mblog.get('status_city', ''),\n",
    "        'ip属地_省份': mblog.get('status_province', ''),\n",
    "        'ip属地_国家': mblog.get('status_country', ''),\n",
    "    }\n",
    "\n",
    "\n",
    "def get_weibo_list(v_keyword, v_max_page, v_file=None):\n",
    "    \"\"\"\n",
    "    Crawl Weibo search results page by page and append them to a CSV file.\n",
    "\n",
    "    Every field of a row is read from the same ``mblog`` object, so the columns\n",
    "    cannot drift out of alignment when a post lacks a field or a card carries\n",
    "    no post at all.\n",
    "\n",
    "    :param v_keyword: search keyword\n",
    "    :param v_max_page: last page to crawl; pages 1..v_max_page are requested\n",
    "    :param v_file: CSV path to append to; defaults to the notebook-level ``v_weibo_file``\n",
    "    :return: None\n",
    "    \"\"\"\n",
    "    out_file = v_weibo_file if v_file is None else v_file\n",
    "    columns = ['页码', '微博id', '微博作者', '发布时间', '微博内容', '转发数', '评论数',\n",
    "               '点赞数', '发布于', 'ip属地_城市', 'ip属地_省份', 'ip属地_国家']  # csv header\n",
    "    # Request URL\n",
    "    url = 'https://m.weibo.cn/api/container/getIndex'\n",
    "    # Start at page 1: the docstring and the output file name promise the first n pages.\n",
    "    for page in range(1, v_max_page + 1):\n",
    "        print('===开始爬取第{}页微博==='.format(page))\n",
    "        # Request parameters\n",
    "        params = {\n",
    "            'containerid': '100103type=1&q={}'.format(v_keyword),\n",
    "            'page_type': 'searchall',\n",
    "            'page': page,\n",
    "        }\n",
    "        try:\n",
    "            # Timeout so one stalled request cannot hang the crawl\n",
    "            r = requests.get(url, headers=headers, params=params, timeout=10)\n",
    "            r.raise_for_status()\n",
    "            cards = r.json()['data']['cards']\n",
    "        except (requests.RequestException, ValueError, KeyError, TypeError) as err:\n",
    "            # A failed or malformed page is reported and skipped, not fatal.\n",
    "            print('page {} skipped: {}'.format(page, err))\n",
    "            continue\n",
    "        # jsonpath returns False (not an empty list) when nothing matches.\n",
    "        mblogs = jsonpath(cards, '$..mblog') or []\n",
    "        rows = []\n",
    "        for mblog in mblogs:\n",
    "            if not isinstance(mblog, dict):\n",
    "                continue\n",
    "            row = _parse_mblog(mblog)\n",
    "            row['页码'] = page\n",
    "            rows.append(row)\n",
    "        print('cards: {}, posts: {}'.format(len(cards), len(rows)))\n",
    "        if not rows:  # no posts on this page, move on to the next one\n",
    "            continue\n",
    "        df = pd.DataFrame(rows, columns=columns)\n",
    "        # Write the header only when the file is being created by this append.\n",
    "        df.to_csv(out_file, mode='a', index=False,\n",
    "                  header=not os.path.exists(out_file), encoding='utf_8_sig')\n",
    "        print('csv保存成功:{}'.format(out_file))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e03813c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 爬取前几页\n",
    "max_search_page = 100  # 爬前n页\n",
    "# 爬取关键字\n",
    "search_keyword = '世界杯'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "312160ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the output CSV, derived from the search keyword and the page count\n",
    "output_name_parts = (search_keyword, max_search_page)\n",
    "v_weibo_file = '微博清单_{}_前{}页.csv'.format(*output_name_parts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "efc3dfee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from a clean slate: delete the CSV if one already exists\n",
    "stale_csv_present = os.path.exists(v_weibo_file)\n",
    "if stale_csv_present:\n",
    "    os.remove(v_weibo_file)\n",
    "    print('微博清单存在，已删除: {}'.format(v_weibo_file))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "57f016a3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "===开始爬取第2页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第3页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第4页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第5页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第6页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第7页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第8页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第9页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第10页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第11页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第12页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第13页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第14页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第15页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第16页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第17页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第18页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第19页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第20页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第21页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第22页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第23页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第24页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第25页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第26页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第27页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第28页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第29页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第30页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第31页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第32页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第33页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第34页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第35页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第36页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第37页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第38页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第39页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第40页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第41页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第42页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第43页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第44页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第45页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第46页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第47页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第48页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第49页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第50页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第51页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第52页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第53页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第54页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第55页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第56页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第57页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第58页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第59页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第60页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第61页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第62页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第63页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第64页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第65页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第66页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第67页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第68页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第69页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第70页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第71页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第72页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第73页微博===\n",
      "200\n",
      "8\n",
      "text_list is:\n",
      "id_list: 8\n",
      "8\n",
      "region_name_list: 8\n",
      "8\n",
      "8\n",
      "8\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第74页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第75页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第76页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第77页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第78页微博===\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第79页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第80页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第81页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第82页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第83页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第84页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第85页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第86页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第87页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第88页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第89页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第90页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第91页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第92页微博===\n",
      "200\n",
      "8\n",
      "text_list is:\n",
      "id_list: 8\n",
      "8\n",
      "region_name_list: 8\n",
      "8\n",
      "8\n",
      "8\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第93页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第94页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第95页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第96页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第97页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第98页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第99页微博===\n",
      "200\n",
      "9\n",
      "text_list is:\n",
      "id_list: 9\n",
      "9\n",
      "region_name_list: 9\n",
      "9\n",
      "9\n",
      "9\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n",
      "===开始爬取第100页微博===\n",
      "200\n",
      "10\n",
      "text_list is:\n",
      "id_list: 10\n",
      "10\n",
      "region_name_list: 10\n",
      "10\n",
      "10\n",
      "10\n",
      "csv保存成功:微博清单_世界杯_前100页.csv\n"
     ]
    }
   ],
   "source": [
    "# Run the crawl with the configured keyword and page count\n",
    "get_weibo_list(search_keyword, max_search_page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "646e48ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据清洗完成\n"
     ]
    }
   ],
   "source": [
    "# Data cleaning: keep only the first row for each Weibo id\n",
    "df = pd.read_csv(v_weibo_file)\n",
    "df = df.drop_duplicates(subset=['微博id'], keep='first')\n",
    "# Overwrite the CSV with the de-duplicated rows\n",
    "df.to_csv(v_weibo_file, index=False, encoding='utf_8_sig')\n",
    "print('数据清洗完成')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b6c0ee4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
